import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import seaborn as sns
import altair as alt
%run src/data_download.py --url=https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv --delim=";" --filepath=data/ --filename=redwine
data_red = pd.read_csv("data/redwine", ",")
data_red = data_red.rename(columns=lambda x: x.replace(" ","_"))
data_red['quality_level'] = ["Excellent" if x >= 7 else "Good" if x >= 5 else "Bad" for x in data_red['quality']]
data_red['wine_type']= "red"
data_red.head()
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | quality_level | wine_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | Good | red |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | Good | red |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | Good | red |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | Good | red |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | Good | red |
# Print the column dtypes, non-null counts and memory usage of the red-wine frame.
data_red.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed_acidity 1599 non-null float64 1 volatile_acidity 1599 non-null float64 2 citric_acid 1599 non-null float64 3 residual_sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free_sulfur_dioxide 1599 non-null float64 6 total_sulfur_dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 12 quality_level 1599 non-null object 13 wine_type 1599 non-null object dtypes: float64(11), int64(1), object(2) memory usage: 175.0+ KB
# Bar chart of how many red wines fall into each quality level.
# The explicit sort keeps the bars in Bad -> Good -> Excellent order; without
# it Altair orders the categories alphabetically (Bad, Excellent, Good).
alt.Chart(data_red).mark_bar().encode(
    alt.X('quality_level', sort=["Bad", "Good", "Excellent"]),
    alt.Y('count()')
).properties(width=200)
%run src/data_download.py --url=https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv --delim=";" --filepath=data/ --filename=whitewine
data_white= pd.read_csv("data/whitewine", ",")
data_white = data_white.rename(columns=lambda x: x.replace(" ","_"))
data_white['quality_level'] = ["Excellent" if x >= 7 else "Good" if x >= 5 else "Bad" for x in data_white['quality']]
data_white['wine_type']= "white"
data_white.head()
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | quality_level | wine_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 | Good | white |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 | Good | white |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 | Good | white |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 | Good | white |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 | Good | white |
# Bar chart of how many white wines fall into each quality level.
# The explicit sort keeps the bars in Bad -> Good -> Excellent order; without
# it Altair orders the categories alphabetically (Bad, Excellent, Good).
alt.Chart(data_white).mark_bar().encode(
    alt.X('quality_level', sort=["Bad", "Good", "Excellent"]),
    alt.Y('count()')
).properties(width=200)
# (rows, columns) of the red-wine frame, shown before the two frames are combined.
data_red.shape
(1599, 14)
# (rows, columns) of the white-wine frame, shown before the two frames are combined.
data_white.shape
(4898, 14)
# Stack the white-wine rows on top of the red-wine rows.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source keeps its own 0-based labels, so labels repeat and a label lookup
# such as full_data.loc[0] matches one white and one red row.
full_data = pd.concat([data_white, data_red], axis=0, ignore_index=True)

# The row count should equal the sum of the two source frames.
full_data.shape
(6497, 14)
# Show the first four rows of the combined frame.
full_data.head(4)
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | quality_level | wine_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 | Good | white |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 | Good | white |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 | Good | white |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 | Good | white |
# Build a pandas-profiling report over the combined red + white frame.
profile = ProfileReport(full_data)
# Bare name as the last expression so the notebook displays the report.
profile